Today we will learn:

  1. Omitted Variable Bias (once more)
  2. Measurement Error
  3. Multicollinearity
  4. Heteroscedasticity
  5. Outliers/Influential Observations

In other words, the goals are to:

  • use Monte Carlo simulations to understand different biases
  • see how we can use simulations for sensitivity analyses/robustness checks
  • detect outliers

# Set knitr chunk options for the rendered document (echo code, wide figures,
# scrollable long output). Only relevant when knitting the .Rmd file.
knitr::opts_chunk$set(echo = TRUE,
                      out.width="\\textwidth", # for larger figures 
                      attr.output = 'style="max-height: 200px;"'
                      )

# Declare every package this analysis needs in one place.
p_needed <-
  c("viridis", "MASS", "sandwich", "lmtest", 
    "ggplot2", "patchwork", "dplyr")

# Which packages are already installed on this machine?
# installed.packages() returns a matrix whose row names are package names.
packages <- rownames(installed.packages())
# Which of the needed packages are still missing? Compare the vector
# p_needed against the vector packages.
p_to_install <- p_needed[!(p_needed %in% packages)]
# Install only the missing packages (if any).
if (length(p_to_install) > 0) {
  install.packages(p_to_install)
}
# Load all needed packages. require() returns a logical instead of erroring,
# so the printed vector shows TRUE for every package that loaded successfully.
sapply(p_needed, require, character.only = TRUE)
##   viridis      MASS  sandwich    lmtest   ggplot2 patchwork     dplyr 
##      TRUE      TRUE      TRUE      TRUE      TRUE      TRUE      TRUE
# Output format for stargazer tables: "latex" when knitting to PDF,
# "html" otherwise.
stargazer_opt <- ifelse(knitr::is_latex_output(), "latex", "html")

# Only relevant for ggplot2 plotting:
# set a global ggplot2 theme once for the entire document to avoid
# repeating it for each plot.
theme_set(theme_classic() + # start with classic theme 
  theme(
    plot.background = element_blank(),# remove all background 
    plot.title.position = "plot", # move the plot title start slightly 
    legend.position = "bottom", # by default, put legend on the bottom
    axis.line.y = element_blank(), # remove line on y axis 
    axis.ticks.y = element_blank(), # remove ticks on y axis
    axis.text.y = element_blank() # remove text on y axis
  ))

Leftover from last week: Simulate Predicted Values

# Load the 2013 German federal election data; this creates the object
# election2013_2 in the workspace, which we convert to a plain data frame.
load(file = "raw-data/election2013_2.RData")

df <- as.data.frame(election2013_2)

We regressed leftvote on unemployment and east. Additionally, we include a multiplicative interaction term unemployment*east.

# OLS: leftvote on unemployment, the east dummy, and their interaction.
# Note: unemployment*east alone would already expand to both main effects
# plus the interaction; listing the main effects explicitly is harmless.
reg <- lm(leftvote ~ unemployment + east + 
            unemployment*east, 
          data = df)
summary(reg)
## 
## Call:
## lm(formula = leftvote ~ unemployment + east + unemployment * 
##     east, data = df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -8.8438 -0.8513 -0.1683  0.5337 11.4562 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        2.26144    0.30817   7.338 2.11e-12 ***
## unemployment       0.54859    0.04739  11.576  < 2e-16 ***
## east              16.10218    1.43807  11.197  < 2e-16 ***
## unemployment:east -0.13650    0.14407  -0.947    0.344    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.874 on 295 degrees of freedom
## Multiple R-squared:  0.9295, Adjusted R-squared:  0.9288 
## F-statistic:  1297 on 3 and 295 DF,  p-value: < 2.2e-16

We also coded a helpful function

quants_mean_fun <- function(x) {
  # Summarise a vector of simulated values: the bounds of the central 95%
  # interval plus the mean.
  # Returns a named numeric vector: "quants.2.5%", "quants.97.5%", "mean".
  interval <- quantile(x, probs = c(0.025, 0.975))
  c(quants = interval, mean = mean(x))
}

And here are all the simulation steps for expected values:

# Step 1 - Get the regression coefficients
beta_hat <- coef(reg)

# Step 2.1. Get the variance-covariance matrix
V_hat <- vcov(reg)

# Step 2.2. Draw from the multivariate normal distribution.
# Each of the nsim rows of S is one simulated coefficient vector,
# capturing estimation uncertainty.
nsim <- 1000 
S <- mvrnorm(nsim, beta_hat, V_hat)

# Step 3 - Choose interesting covariate values.
# Column order must match the coefficients: intercept, unemployment, east,
# unemployment:east.
unemp <- seq(0, 20, 0.1)  # Range from 0 to 20, in steps of 0.1
scenario_east <- cbind(1, unemp, 1, unemp * 1) 
scenario_west <- cbind(1, unemp, 0, unemp * 0)

# Step 4 - Calculate Quantities of Interest
# nsim x length(unemp) matrices of expected values.
EV_range_east <- S %*% t(scenario_east)
EV_range_west <- S %*% t(scenario_west)


# Quantiles, we again use apply and our quants_mean_fun
# (rows: 2.5%, 97.5%, mean; one column per unemployment value)
quants_range_east <- apply(EV_range_east, 2, quants_mean_fun)
quants_range_west <- apply(EV_range_west, 2, quants_mean_fun)

Visualize the results:

# Empty canvas (type = "n") over the unemployment range; points, polygons,
# and lines are layered on top below.
plot(
  unemp,
  quants_range_east[2,],
  pch = 19,
  cex = 0.3,
  bty = "n",
  las = 1,
  ylim = c(0, 35),
  ylab = "Voteshare (%)",
  main = "Expected Voteshare (Die Linke)",
  xlab = "Range of Unemployment",
  type = "n"
)

# Observed districts as points.
points(df$unemployment,
       df$leftvote,
       pch = 19,
       col = adjustcolor(viridis(3)[1], alpha = 0.8))

# 95% confidence bands as shaded polygons: lower bound left-to-right,
# then the upper bound reversed to close the shape.
polygon(
  x = c(unemp, rev(unemp)),
  y = c(quants_range_east[1 ,], rev(quants_range_east[2 ,])),
  col = adjustcolor(viridis(3)[2], alpha = 0.5),
  border = NA
)
polygon(
  x = c(unemp, rev(unemp)),
  y = c(quants_range_west[1 ,], rev(quants_range_west[2 ,])),
  col = adjustcolor(viridis(3)[3], alpha = 0.5),
  border = NA
)


# Mean expected values on top of the bands.
lines(unemp, quants_range_east[3,], col = viridis(3)[2])  # In this case I usually plot the polygons first and then the lines.

lines(unemp, quants_range_west[3,],
      col = viridis(3)[3])

# Add a legend

legend("topleft",
       lty = "solid",
       col = viridis(3)[2:3],
       legend = c("East", "West"),
       cex = 0.8,
       bty = "n")

Simulation of Predicted Values Y|X

Now we also want to simulate predicted values. What’s different from expected values?

  • Step 1 - Get the regression coefficients.
    • Exactly the same as above.
  • Step 2 - Generate sampling distribution.
    • Exactly the same as above.
  • Step 3 - Choose covariate values.
    • Exactly the same as above.
# Scenario vectors at mean unemployment for East (east = 1) and West
# (east = 0); element order matches the coefficient vector.
X_east <- c(1, mean(df$unemployment), 1, mean(df$unemployment) * 1) # East
X_west <- c(1, mean(df$unemployment), 0, mean(df$unemployment) * 0) # West

X <- as.matrix(rbind(X_east, X_west))

Step 4 - Calculate Quantities of Interest: Predicted Values

This is still the same as above.

# Expected values: column 1 = East scenario, column 2 = West scenario.
EV_combined <- S %*% t(X)

Now we need to add something. Remember sigma_est (i.e. \(\hat\sigma\)) from last lab/lecture? That’s the fundamental uncertainty!

# Y ~ N(mu_c, sigma_est)
# Residual standard error computed by hand; this equals summary(reg)$sigma
# (or sigma(reg)).
sigma_est <- sqrt(sum(residuals(reg)^2) / (nrow(df) - length(beta_hat)))

# Add fundamental uncertainty: one error draw per simulated coefficient set.
Y_hat_east <- EV_combined[,1] + rnorm(nsim, 0, sigma_est)
Y_hat_west <- EV_combined[,2] + rnorm(nsim, 0, sigma_est)


# Quantiles
quants_east <- quants_mean_fun(Y_hat_east)
quants_west <- quants_mean_fun(Y_hat_west)

Let’s plot it:

# Histogram Predicted Values West Germany
hist(Y_hat_west,
     las = 1,
     main = "Histogram of Predicted Values (West Germany)",
     col = viridis(3)[1], 
     border = "white")
# Dashed lines at the 2.5% quantile, 97.5% quantile, and mean.
abline(v = c(quants_west[1:3]), lty = 2, col = viridis(3)[3])

# Histogram Predicted Values East Germany
hist(Y_hat_east,
     las = 1,
     main = "Histogram of Predicted Values (East Germany)",
     col = viridis(3)[2], 
     border = "white")
abline(v = c(quants_east[1:3]), lty = 2, col = viridis(3)[3])

# We could put both distributions in one plot.
plot(density(Y_hat_west), 
     xlim = c(0,40), 
     lwd = 2 ,
     bty = "n", 
     yaxt = "n", 
     ylab = "", 
     xlab = "Left Voteshare in %",
     main = "Predicted Values for Voteshare",
     type = "n")
lines(density(Y_hat_west, from = min(Y_hat_west), to = max(Y_hat_west)), lwd = 2, col = viridis(3)[1])
lines(density(Y_hat_east, from = min(Y_hat_east), to = max(Y_hat_east)), lwd = 2, col = viridis(3)[2])
abline(v = c(quants_west[1:3]), lty = 2, col = viridis(3)[3])
abline(v = c(quants_east[1:3]), lty = 2, col = viridis(3)[3])

Let’s also do it for a range of unemployment.

Step 4 - Calculate Quantities of Interest over a range

EV_range_east <- S %*% t(scenario_east)
EV_range_west <- S %*% t(scenario_west)

# Add fundamental uncertainty. The length-nsim error vector is recycled
# column-wise, so each simulation (row) reuses one draw across the range;
# the per-scenario (column-wise) quantiles computed below are unaffected.
Y_hat_range_east <- EV_range_east + rnorm(nsim, 0, sigma_est)
Y_hat_range_west <- EV_range_west + rnorm(nsim, 0, sigma_est)

# Quantiles, we again use apply and our quants_mean_fun
Y_quants_range_east <- apply(Y_hat_range_east, 2, quants_mean_fun)
Y_quants_range_west <- apply(Y_hat_range_west, 2, quants_mean_fun)

Plot it with polygons as confidence intervals.

# Same layered base-R plot as for the expected values, now with the (wider)
# predicted-value bands.
plot(
  unemp,
  Y_quants_range_east[2, ],
  las = 1,
  bty = "n",
  pch = 19,
  cex = 0.3,
  ylim = c(0, 45),
  ylab = "Voteshare (%)",
  main = "Predicted Voteshare (Die Linke)",
  xlab = "Range of Unemployment",
  type = "n"
)

points(df$unemployment,
       df$leftvote ,
       pch = 19,
       col = adjustcolor(viridis(3)[1], alpha = 0.8))

polygon(
  x = c(unemp, rev(unemp)),
  y = c(Y_quants_range_east[1 , ], rev(Y_quants_range_east[2 , ])),
  col = adjustcolor(viridis(3)[2], alpha = 0.5),
  border = NA
)
polygon(
  x = c(unemp, rev(unemp)),
  y = c(Y_quants_range_west[1 , ], rev(Y_quants_range_west[2 , ])),
  col = adjustcolor(viridis(3)[3], alpha = 0.5),
  border = NA
)


lines(unemp, Y_quants_range_east[3, ],
      col = viridis(3)[2])

lines(unemp, Y_quants_range_west[3, ],
      col = viridis(3)[3])

# Add a legend

legend("topleft",
       lty = "solid",
       col = viridis(3)[2:3],
       legend = c("East", "West"),
       cex = 0.8,
       bty = "n")

# data frame for predicted values, West (rows of t(): ci_lo, ci_hi, mean)
plot_pv_west <- data.frame(t(Y_quants_range_west))
names(plot_pv_west) <- c("ci_lo", "ci_hi", "mean")
plot_pv_west$unemp <- unemp
plot_pv_west$east <- 0

# data frame for predicted values, East
plot_pv_east <- data.frame(t(Y_quants_range_east))
names(plot_pv_east) <- c("ci_lo", "ci_hi", "mean")
plot_pv_east$unemp <- unemp
plot_pv_east$east <- 1

# combine data frames into long format for ggplot
plot_pv <- rbind(plot_pv_west, plot_pv_east)

# plot: observed districts plus simulated predicted values (ggplot2 version)
ggplot(
  data = df,
  aes(x = unemployment, y = leftvote,
      group = east)
) +
  geom_point(
    color = viridis(2, 0.5)[1]
  ) +
  # add mean predicted values
  geom_line(data = plot_pv, aes(x = unemp, 
                                y = mean,
                                group = east)) +
  # add confidence intervals
  geom_line(data = plot_pv, aes(x = unemp, 
                                y = ci_lo,
                                group = east),
            linetype = "dashed") +
  geom_line(data = plot_pv, aes(x = unemp, 
                                y = ci_hi,
                                group = east),
            linetype = "dashed") +
  theme_classic() +
  labs(
    x = "Unemployment in %",
    y = "Predicted Voteshare (Die Linke) in %",
    color = "",
    title = "Left Voteshare and Unemployment"
  ) +
  theme(legend.position = "none")

The confidence bounds are wider because we take the fundamental uncertainty of our model into account. To see this we can plot the expected values plot and predicted values plot side by side.

# Two panels side by side: expected values (left) vs predicted values (right),
# both on the same y-scale so the wider PV bands are visible.
par(mfrow=c(1,2))

# Plot "Expected Voteshares" of Die Linke

plot(
  unemp,
  quants_range_east[2,],
  pch = 19,
  cex = 0.3,
  bty = "n",
  las = 1,
  ylim = c(0, 45),
  ylab = "Voteshare (%)",
  main = "Expected Voteshare (Die Linke)",
  xlab = "Range of Unemployment",
  type = "n"
)

points(df$unemployment,
       df$leftvote,
       pch = 19,
       col = adjustcolor(viridis(3)[1], alpha = 0.8))

polygon(
  x = c(unemp, rev(unemp)),
  y = c(quants_range_east[1 ,], rev(quants_range_east[2 ,])),
  col = adjustcolor(viridis(3)[2], alpha = 0.5),
  border = NA
)
polygon(
  x = c(unemp, rev(unemp)),
  y = c(quants_range_west[1 ,], rev(quants_range_west[2 ,])),
  col = adjustcolor(viridis(3)[3], alpha = 0.5),
  border = NA
)


lines(unemp, quants_range_east[3,], col = viridis(3)[2])  # In this case I usually plot the polygons first and then the lines.

lines(unemp, quants_range_west[3,],
      col = viridis(3)[3])

# Add a legend

legend("topleft",
       lty = "solid",
       col = viridis(3)[2:3],
       legend = c("East", "West"),
       cex = 0.8,
       bty = "n")

# Plot "Predicted Voteshares" of Die Linke

plot(
  unemp,
  Y_quants_range_east[2, ],
  las = 1,
  bty = "n",
  pch = 19,
  cex = 0.3,
  ylim = c(0, 45),
  ylab = "Voteshare (%)",
  main = "Predicted Voteshare (Die Linke)",
  xlab = "Range of Unemployment",
  type = "n"
)

points(df$unemployment,
       df$leftvote ,
       pch = 19,
       col = adjustcolor(viridis(3)[1], alpha = 0.8))

polygon(
  x = c(unemp, rev(unemp)),
  y = c(Y_quants_range_east[1 , ], rev(Y_quants_range_east[2 , ])),
  col = adjustcolor(viridis(3)[2], alpha = 0.5),
  border = NA
)
polygon(
  x = c(unemp, rev(unemp)),
  y = c(Y_quants_range_west[1 , ], rev(Y_quants_range_west[2 , ])),
  col = adjustcolor(viridis(3)[3], alpha = 0.5),
  border = NA
)


lines(unemp, Y_quants_range_east[3, ],
      col = viridis(3)[2])

lines(unemp, Y_quants_range_west[3, ],
      col = viridis(3)[3])

We’ll work with the simulation approach a lot and you will get more and more familiar with it as we apply it in various contexts.

Omitted Variable Bias (once more)

Today, we introduce a new name for something we already did many times: Monte Carlo Simulation.

Steps for Monte Carlo Simulations:

  1. Generate fake data X.
  2. Choose true coefficients b.
  3. Generate true Y. (\(Y \sim N(\mu, \sigma^2)\))
  4. Sample from population.
  5. Run regressions (or whatever you can think of).
  6. Repeat n (e.g. 1,000) times, this generates a sampling distribution of coefficients.
  7. Compare to true values.

To do this we need the MASS library (loaded in the setup chunk).

Let’s start with an example you already know. What happens if we forget to include a variable in our model that is a confounder in the true data generating process?

![Confounder diagram](confounder.png){width=250px} <!-- NOTE(review): image reference was garbled ("Confounder{250px}") — confirm the actual file path -->

1. Generate fake data X.

We start with the means:

# Means of the two covariates.
mus <- c(5, 10)
mus
## [1]  5 10

We also need standard deviations:

# Standard deviations, as a diagonal matrix for building the
# variance-covariance matrix below.
sds <- c(4, 5)
sds_mat <- diag(sds) 
sds_mat
##      [,1] [,2]
## [1,]    4    0
## [2,]    0    5

Correlation matrix (we assume some multicollinearity). Why?

# Correlation of 0.3 between the two covariates (mild multicollinearity).
cor_mat <- matrix(c(1, 0.3, 0.3, 1), nrow = 2, ncol = 2)
cor_mat 
##      [,1] [,2]
## [1,]  1.0  0.3
## [2,]  0.3  1.0

Convert to variance covariance matrix

# varcov = D %*% R %*% D, with D the diagonal sd matrix and R the
# correlation matrix.
varcov <- sds_mat %*% cor_mat %*% sds_mat 
# step-by-step
sds_mat %*% cor_mat
##      [,1] [,2]
## [1,]  4.0  1.2
## [2,]  1.5  5.0
sds_mat %*% cor_mat %*% sds_mat 
##      [,1] [,2]
## [1,]   16    6
## [2,]    6   25
#check sqrt of diag:
sqrt(diag(varcov)) #our initial sds
## [1] 4 5

Now we can generate X by taking draws from a multivariate normal.

# Draw 100,000 observations of the two covariates from a multivariate normal.
X <- mvrnorm(n = 100000, 
             mu = mus, 
             Sigma = varcov)

head(X)
##           [,1]      [,2]
## [1,]  7.596385  1.181996
## [2,] -2.625701  7.948951
## [3,]  8.690780 15.689219
## [4,]  6.565599 18.779475
## [5,]  7.728005 12.368167
## [6,]  6.225189  9.820113
cor(X[, 1], X[, 2]) # check correlation (we specified it in cor_mat)
## [1] 0.3021148

2. Choose true coefficients \(b\).

# True coefficients of the data generating process.
b <- c(10, # intercept
       3,  # beta_1
       5)  # beta_2

3. Generate \(Y \sim N(\mu, \sigma^2)\)

mu <- cbind(1, X) %*% b  # systematic component

# Choose sigma_est (sd of the stochastic component)
sigma_est <- 1

# Generate Y
Y <- rnorm(100000, 
           mean = mu,       # systematic component
           sd = sigma_est)  # stochastic component

# Create population data (data.frame names the columns Y, V2, V3)

pop <- data.frame(cbind(Y, X)) 
head(pop)
##           Y        V2        V3
## 1  38.46094  7.596385  1.181996
## 2  42.56915 -2.625701  7.948951
## 3 114.52981  8.690780 15.689219
## 4 123.16810  6.565599 18.779475
## 5  95.57617  7.728005 12.368167
## 6  78.17619  6.225189  9.820113

We can put steps 1 to 3 in one function.

gen_pop <- function(mus, varcov, b, sigma_est, n = 100000) {
  # Generate a fake "population" for Monte Carlo simulations (steps 1-3).
  #
  # mus:       vector of covariate means
  # varcov:    variance-covariance matrix of the covariates
  # b:         true coefficients c(intercept, beta_1, ..., beta_k)
  # sigma_est: sd of the stochastic component of Y
  # n:         population size (default 100000, the previously
  #            hard-coded value, so existing calls are unchanged)
  #
  # Returns a data frame with Y in the first column and the covariates
  # (named V2, V3, ... by data.frame) in the remaining columns.

  # 1. Draw the covariates from a multivariate normal.
  X <- mvrnorm(n = n, 
               mu = mus, 
               Sigma = varcov) 
  
  # 2. Systematic component mu = X'b (prepend a column of 1s for the
  #    intercept).
  mu_y <- cbind(1, X) %*% b
  
  # 3. Stochastic component: Y ~ N(mu, sigma_est^2).
  Y <- rnorm(n, 
             mean = mu_y, 
             sd = sigma_est)
  
  population <- data.frame(cbind(Y, X))
  
  return(population)
}

# See if our function works (reproduces steps 1-3 in one call)

pop <- gen_pop(mus = mus, 
               varcov = varcov, 
               b = b, 
               sigma_est = sigma_est)

head(pop)
##            Y        V2        V3
## 1  -6.048382  6.615709 -7.066335
## 2 105.345943 10.900004 12.606267
## 3  65.766866  2.741620  9.387026
## 4  64.751920  9.340421  5.159298
## 5 134.964377  5.885883 21.374333
## 6  79.446339  6.501881  9.783187

Next, we need to:

4. Sample from population

and…

5. Run regressions with and without omitting the variable.

We do that in one function:

drawncalc_ovb <- function(pop) {
  # Draw one sample of 500 from the population and estimate beta_1 twice:
  # once with the confounder V3 included, once with it omitted.
  # Returns c(unbiased estimate, biased estimate).
  rows <- sample(nrow(pop), size = 500)
  pop_sample <- pop[rows, ] #dplyr::sample_n(pop, 500)

  # Correctly specified model: confounder V3 included.
  fit_full <- lm(Y ~ V2 + V3, data = pop_sample)

  # Misspecified model: confounder V3 omitted.
  fit_omit <- lm(Y ~ V2, data = pop_sample)

  # V2 coefficient from each fit (coef() gives the same values as
  # summary()$coefficients[, 1]).
  c(coef(fit_full)[["V2"]],
    coef(fit_omit)[["V2"]])
}

6. Repeat 1,000 times to generate the sampling distributions.

# Step 6: each replicate draws a fresh sample and returns c(unbiased, biased).
b_hats_ovb <- t(replicate(1000, drawncalc_ovb(pop = pop)))

head(b_hats_ovb) # 2 cols with 1000 regression coefficients
##          [,1]     [,2]
## [1,] 2.998042 5.050788
## [2,] 2.996611 5.109977
## [3,] 3.017213 5.060918
## [4,] 3.015060 4.275836
## [5,] 2.998681 4.973375
## [6,] 3.007050 4.855191

7. Compare to true values.

Base R

two_cols <- viridis(2)
two_cols_alpha <- viridis(2, 0.5)

# We plot both the biased and unbiased sampling distributions; the dashed
# vertical lines mark each distribution's mean (true beta_1 is 3).
plot(density(b_hats_ovb[, 1]), 
     xlim = c(2.5,6.5), 
     main = "Omitted Variable Bias", 
     col = two_cols[1],
     bty = "n", 
     las = 1,
     ylab = "")

polygon(density(b_hats_ovb[, 1]),
        col = two_cols_alpha[1],
        border = F)

lines(density(b_hats_ovb[, 2]), 
      col = two_cols[2])
polygon(density(b_hats_ovb[, 2]),
        col = two_cols_alpha[2],
        border = F)

abline(v = c(mean(b_hats_ovb[, 1]), 
             mean(b_hats_ovb[, 2])),
       col = two_cols, 
       lty = 2)

Omitting a confounder can dramatically bias our estimates of interest.

ggplot2

# put data to long format (better practice in ggplot2):
# instead of 2 columns, make one with another column identifying the model
b_hats_plot <- data.frame(
  estimate = c(b_hats_ovb[, 1], b_hats_ovb[, 2]),
  Model = rep(c("Unbiased", "Biased"), each = nrow(b_hats_ovb))
)


ggplot(b_hats_plot, 
  aes(x = estimate, fill = Model, color = Model)
) +
  geom_density() + # add two density plots 
  scale_color_viridis(discrete = T, direction = -1) + # change colors with viridis palette
  scale_fill_viridis(discrete = T, alpha = 0.5, direction = -1) + #  fill with viridis
  labs(
    title = "Omitted Variable Bias",
    x = "",
    y = "",
    fill = "Model"
  ) +
  geom_vline(
    xintercept = apply(b_hats_ovb, 2, mean), # means of the two columns
    color = viridis(2) 
    ) +
  geom_hline( # remove the bottom line for cleaner look 
    yintercept = 0,
    color = "white" # grid line color
    ) 

Omitting a confounder can dramatically bias our estimates of interest.

Measurement Error

1. Generate fake data \(X\).

We start with the means

# Covariate means (same setup as in the OVB section).
mus <- c(5, 10)

We also need standard deviations

# Covariate standard deviations as a diagonal matrix.
sds <- c(4, 5)
sds_mat <- diag(sds) # construct a diagonal matrix
sds_mat
##      [,1] [,2]
## [1,]    4    0
## [2,]    0    5

Correlation matrix (for now we assume no multicollinearity).

# Identity correlation matrix: uncorrelated covariates this time.
cor_mat <- matrix(data = c(1, 0, 0, 1), nrow = 2, ncol = 2)
cor_mat
##      [,1] [,2]
## [1,]    1    0
## [2,]    0    1

Convert to variance-covariance matrix:

# varcov = D %*% R %*% D (diagonal here since R is the identity).
varcov <- sds_mat %*% cor_mat %*% sds_mat 
varcov
##      [,1] [,2]
## [1,]   16    0
## [2,]    0   25

Now we can generate X

# Draw the covariates; the sample correlation should be close to 0.
X <- mvrnorm(n = 100000, 
             mu = mus, 
             Sigma = varcov)
cor(X[, 1], X[, 2]) # check correlation
## [1] 0.001342222

2. Choose true coefficients \(b\)

# True coefficients: intercept 10, beta_1 = 3, beta_2 = 5.
b <- c(10, 3, 5)

3. Generate \(Y\) ~ \(N(\mu,\sigma^2)\)

mu <- cbind(1, X) %*% b  # systematic component

# Choose sigma_est (sd of the stochastic component)
sigma_est <- 1

# Generate Y from the TRUE (error-free) covariates
Y <- rnorm(100000, 
           mean = mu,        # systematic component
           sd = sigma_est)   # stochastic component

Now we need to add some measurement error in X (e.g., in V2)

# Classical measurement error: contaminate the observed V2 AFTER Y was
# generated, so Y depends on the true values but we only see X-star.
sigma_err <- 0.5

X[, 1] <- X[, 1] + rnorm(length(X[, 1]), 0, sigma_err) # X-star from the lecture

# Population data

pop <- data.frame(cbind(Y, X)) 
head(pop)
##             Y        V2        V3
## 1  75.0174427  7.942766  7.896190
## 2 104.2665025  8.091110 14.441986
## 3  85.2890589  2.425802 13.489344
## 4  54.2641485 -3.586130 10.781174
## 5  -0.9287311 -1.636571 -1.270940
## 6  54.2242427 11.522858  2.387464

We can modify our gen_pop function accordingly.

gen_pop <- function(mus, varcov, b, sigma_est, sigma_err){
  # Build a fake population in which the first covariate is observed with
  # classical measurement error of sd sigma_err. Y is generated from the
  # TRUE covariate values; the error is added to X[, 1] only afterwards
  # (X-star from the lecture).

  # Draw the true covariates.
  X <- mvrnorm(n = 100000, mu = mus, Sigma = varcov) 

  # True systematic component and outcome.
  mu_y <- cbind(1, X) %*% b
  Y <- rnorm(100000, 
             mu_y, 
             sigma_est)

  # Contaminate the observed version of the first covariate.
  X[, 1] <- X[, 1] + rnorm(length(X[, 1]), 0, sigma_err) 

  data.frame(cbind(Y, X))
}

Perfect, let’s set up some scenarios using our function.

a) No measurement error

# Scenario a): sigma_err = 0, no measurement error.
pop1 <- gen_pop(mus = mus, 
                varcov = varcov, 
                b = b,
                sigma_est = sigma_est, 
                sigma_err = 0)

b) Small measurement error

# Scenario b): small measurement error (sigma_err = 0.25).
pop2 <- gen_pop(mus = mus, 
                varcov = varcov, 
                b = b,
                sigma_est = sigma_est, 
                sigma_err = 0.25)

c) High measurement error

# Scenario c): high measurement error (sigma_err = 0.5).
pop3 <- gen_pop(mus = mus, 
                varcov = varcov, 
                b = b,
                sigma_est = sigma_est, 
                sigma_err = 0.5)

d) Very high measurement error

# Scenario d): very high measurement error (sigma_err = 1).
pop4 <- gen_pop(mus = mus, 
                varcov = varcov, 
                b = b,
                sigma_est = sigma_est, 
                sigma_err = 1)

Next, we need to:

4. Sample from population

and …

5. Run regressions with the different measurement error scenarios.

We do that in one function:

drawncalc <- function(pop) {
  # Draw a sample of 500 and regress Y on both covariates.
  # Returns the named pair (Estimate, Std. Error) for the V2 coefficient.
  pop_sample <- pop[sample(nrow(pop), size = 500), ]

  fit <- lm(Y ~ V2 + V3, data = pop_sample)

  # Row "V2" of the coefficient table; same cells as [2, 1:2].
  summary(fit)$coefficients["V2", c("Estimate", "Std. Error")]
}

6. Repeat 1,000 times to generate the sampling distributions.

We do this for the different scenarios.

# Sampling distributions of (estimate, SE) for each measurement-error
# scenario; 1000 samples of 500 each.
# without measurement error
b_hats_no <- t(replicate(1000, drawncalc(pop = pop1)))

# with small measurement error
b_hats_small <- t(replicate(1000, drawncalc(pop = pop2)))

# with high measurement error 
b_hats_high <- t(replicate(1000, drawncalc(pop = pop3)))

# with very high measurement error
b_hats_veryhigh <- t(replicate(1000, drawncalc(pop = pop4)))

7. Compare to true values.

Plots help us here.

Base R

four_cols <- viridis(4)
four_cols_alpha <- viridis(4, 0.5)

# Two panels: sampling distributions of the point estimate (left, true
# value is 3) and of the standard error (right) for all four scenarios.
par(mfrow = c(1, 2))

# first plot

plot(density(b_hats_no[, 1]), 
     xlim = c(2.7, 3.05), 
     main = "Point Estimate",
     bty = "n", 
     las = 1, 
     yaxt = "n", 
     ylab = "",
     xlab = "",
     type = "n")

polygon(density(b_hats_no[, 1]),
        col = four_cols_alpha[1],
        border = F)
polygon(density(b_hats_small[, 1]),
        col = four_cols_alpha[2],
        border = F)
polygon(density(b_hats_high[, 1]),
        col = four_cols_alpha[3],
        border = F)
polygon(density(b_hats_veryhigh[, 1]),
        col = four_cols_alpha[4],
        border = F)

lines(density(b_hats_no[, 1]), 
      col = four_cols[1])
lines(density(b_hats_small[, 1]), 
      col = four_cols[2])
lines(density(b_hats_high[, 1]), 
      col = four_cols[3])
lines(density(b_hats_veryhigh[, 1]), 
      col = four_cols[4])

# Dashed lines at each scenario's mean estimate.
abline(v = c(mean(b_hats_no[, 1]), 
             mean(b_hats_small[, 1]),
             mean(b_hats_high[, 1]), 
             mean(b_hats_veryhigh[, 1])),
       col = four_cols, lty = 2)

legend("topleft", 
       legend = c("No", "Small", "High", "Very high"),
       col = four_cols, 
       lty = "dashed", 
       cex = 0.7,
       bty = "n")

# second plot 

plot(density(b_hats_no[, 2]), 
     xlim = c(0.009, 0.04), 
     main = "Standard Error",
     bty = "n", 
     las = 1, 
     yaxt = "n", 
     ylab = "",
     xlab = "",
     type = "n")

polygon(density(b_hats_no[, 2]),
        col = four_cols_alpha[1],
        border = F)
polygon(density(b_hats_small[, 2]),
        col = four_cols_alpha[2],
        border = F)
polygon(density(b_hats_high[, 2]),
        col = four_cols_alpha[3],
        border = F)
polygon(density(b_hats_veryhigh[, 2]),
        col = four_cols_alpha[4],
        border = F)

lines(density(b_hats_no[, 2]), 
      col = four_cols[1])
lines(density(b_hats_small[, 2]), 
      col = four_cols[2])
lines(density(b_hats_high[, 2]), 
      col = four_cols[3])
lines(density(b_hats_veryhigh[, 2]), 
      col = four_cols[4])

abline(v = c(mean(b_hats_no[, 2]), 
             mean(b_hats_small[, 2]),
             mean(b_hats_high[, 2]), 
             mean(b_hats_veryhigh[, 2])),
       col = four_cols, 
       lty = 2)

What is the relationship between measurement error in V2 and the point estimate and the standard error?

# Mean point estimate against the amount of measurement error (attenuation).
plot(x = c(0, 0.25, 0.5, 1), 
     y = c(mean(b_hats_no[, 1]), 
           mean(b_hats_small[, 1]),
           mean(b_hats_high[, 1]), 
           mean(b_hats_veryhigh[, 1])),
     pch = 19,
     col = viridis(1, 0.8),
     main = "Measurement Error and Point Estimate", 
     xlab = "Measurement Error", 
     ylab = "Point Estimate", 
     bty = "n", 
     las = 1)

# Mean standard error against the amount of measurement error.
plot(x = c(0, 0.25, 0.5, 1), 
     y = c(mean(b_hats_no[, 2]), 
           mean(b_hats_small[, 2]),
           mean(b_hats_high[, 2]), 
           mean(b_hats_veryhigh[, 2])),
     pch = 19,
     col = viridis(1, 0.8),
     main = "Measurement Error and Standard Error", 
     xlab = "Measurement Error", 
     ylab = "Standard Error",
     bty = "n", 
     las = 1)

This attenuation bias, which is introduced through measurement error in X, biases our coefficients towards zero.

In the exercise section, you will see how measurement error in Y affects our coefficients.

ggplot2

# Stack the four scenarios and label them with a factor whose levels are
# ordered No < Small < High < Very High (controls legend/color order).
b_hats <- as.data.frame(rbind(b_hats_no,
                              b_hats_small,
                              b_hats_high,
                              b_hats_veryhigh))
b_hats$error <-
  factor(rep(c("No", "Small", "High", "Very High"), each = nrow(b_hats) / 4), 
         levels = c("No", "Small", "High", "Very High"))


# first plot: densities of the point estimate by error scenario
p1 <- ggplot(data = b_hats, aes(x = Estimate,  fill = error, color = error)) +
  labs(
    title = "Point Estimate",
    x = "",
    y = "",
    fill = "Measurement Error",
    color = "Measurement Error"
  ) +
  geom_density() + # add densities 
  geom_vline( # manually add lines with means 
    xintercept = b_hats %>% # take b_hats 
      group_by(error) %>% # apply all calculations within groups of error
      summarise(mean = mean(Estimate)) %>% #  take mean of Estimate column 
      pull(mean), # pull the mean column 
    linetype = "dashed",
    color = viridis(4)
  ) +
  scale_fill_viridis(discrete = T, alpha = 0.5) +
  scale_color_viridis(discrete = T) 

# second plot: densities of the standard error by error scenario

p2 <- ggplot(data = b_hats, aes(x = `Std. Error`, fill = error, color= error)) +
  labs(
    title = "Standard Error",
    x = "",
    y = "",
    fill = "Measurement Error",
    color = "Measurement Error"
  ) +
  geom_density() +
  geom_vline( # manually add lines with means 
    xintercept = b_hats %>% # take b_hats 
      group_by(error) %>% # apply all calculations within groups of error
      summarise(mean = mean(`Std. Error`)) %>% #  take mean of `Std. Error` column 
      pull(mean), # pull the mean column 
    linetype = "dashed",
    color = viridis(4)
  ) +
  scale_fill_viridis(discrete = T, alpha = 0.5) +
  scale_color_viridis(discrete = T)

# put plots together with patchwork package 
p1 + p2 + plot_layout(guides = "collect") &
  geom_hline( # remove the bottom line for cleaner look 
    yintercept = 0,
    color = "white" # grid line color
    )

What is the relationship between measurement error in V2 and the point estimate and the standard error?

# Scenario means of the estimate (p1) and SE (p2) against the error size.
p1 <- b_hats %>% # take b_hats 
      group_by(error) %>% # apply all calculations within groups of error
      summarise_all(mean) %>% # summarize all variables
  ggplot() + 
  geom_point(
    aes(x = c(0, 0.25, 0.5, 1), # the four sigma_err values, in factor order
        y = Estimate)
  ) + 
  labs(
    x = "Measurement Error",
    y = "Point Estimate"
  ) +
  theme_classic() # restore to classic theme with no modifications 

p2 <- b_hats %>% # take b_hats 
      group_by(error) %>% # apply all calculations within groups of error
      summarise_all(mean) %>% 
  ggplot() + 
  geom_point(
    aes(x = c(0, 0.25, 0.5, 1),
        y = `Std. Error`)
  ) + 
  labs(
    x = "Measurement Error",
    y = "Standard Error"
  ) +
  theme_classic()

p1 + p2 + plot_annotation(title = "The Effect of Measurement Error in X on Estimates")

This attenuation bias, which is introduced through measurement error in X, biases our coefficients towards zero.

In the exercise section, you will see how measurement error in Y affects our coefficients.

Multicollinearity

Steps 1 to 3: Again, we only need to modify our gen_pop function from above. What changes?

gen_pop <- function(mus, varcov, b, sigma_est){
  # Generate a fake population: covariates from a multivariate normal
  # (the correlation structure in varcov is what controls the degree of
  # multicollinearity), then Y ~ N(cbind(1, X) %*% b, sigma_est^2).

  # Generate IV
  covariates <- mvrnorm(n = 100000, 
                        mu = mus, 
                        Sigma = varcov) 

  systematic <- cbind(1, covariates) %*% b

  Y <- rnorm(100000, 
             mean = systematic, 
             sd = sigma_est)

  data.frame(cbind(Y, covariates))
}

Let’s set up some scenarios:

# Four populations that differ only in the correlation between V2 and V3:
# 0, 0.3, 0.6, and 0.9.

# a) No multicollinearity

cor_mat1 <- matrix(c(1, 0, 0, 1), 
                   nrow = 2, 
                   ncol = 2)

varcov1 <- sds_mat %*% cor_mat1 %*% sds_mat 

pop1 <- gen_pop(mus = mus, 
                varcov = varcov1, 
                b = b, 
                sigma_est = sigma_est)

# b) Small multicollinearity

cor_mat2 <- matrix(c(1, 0.3, 0.3, 1), 
                   nrow = 2, 
                   ncol = 2)

varcov2 <- sds_mat %*% cor_mat2 %*% sds_mat 

pop2 <- gen_pop(mus = mus, 
                varcov = varcov2, 
                b = b, 
                sigma_est = sigma_est)

# c) High multicollinearity

cor_mat3 <- matrix(c(1, 0.6, 0.6, 1), nrow = 2, ncol = 2)

varcov3 <- sds_mat %*% cor_mat3 %*% sds_mat 

pop3 <- gen_pop(mus = mus, 
                varcov = varcov3, 
                b = b, 
                sigma_est = sigma_est)

# d) Very high multicollinearity

cor_mat4 <- matrix(c(1, 0.9, 0.9, 1), 
                   nrow = 2, 
                   ncol = 2)

varcov4 <- sds_mat %*% cor_mat4 %*% sds_mat 

pop4 <- gen_pop(mus = mus, 
                varcov = varcov4, 
                b = b, 
                sigma_est = sigma_est)

Steps 4 to 6 are the same as above. (And we can use the same function.)

# Sampling distributions of (estimate, SE) for each multicollinearity level.
b_hats_no <- t(replicate(1000, drawncalc(pop = pop1)))

b_hats_small <- t(replicate(1000, drawncalc(pop = pop2)))

b_hats_high <- t(replicate(1000, drawncalc(pop = pop3)))

b_hats_veryhigh <- t(replicate(1000, drawncalc(pop = pop4)))

7. To compare to the true values we make some plots

Base R

# Two panels: sampling distributions of the point estimate (left, true value
# is 3) and of the standard error (right) for all four correlation levels.
par(mfrow = c(1, 2))

# first plot

plot(density(b_hats_no[, 1]), 
     xlim = c(2.9, 3.1), 
     main = "Point Estimate",
     bty = "n", 
     las = 1, 
     yaxt = "n", 
     ylab = "",
     xlab = "",
     type = "n")

polygon(density(b_hats_no[, 1]),
        col = four_cols_alpha[1],
        border = F)
polygon(density(b_hats_small[, 1]),
        col = four_cols_alpha[2],
        border = F)
polygon(density(b_hats_high[, 1]),
        col = four_cols_alpha[3],
        border = F)
polygon(density(b_hats_veryhigh[, 1]),
        col = four_cols_alpha[4],
        border = F)

lines(density(b_hats_no[, 1]), 
      col = four_cols[1])
lines(density(b_hats_small[, 1]), 
      col = four_cols[2])
lines(density(b_hats_high[, 1]), 
      col = four_cols[3])
lines(density(b_hats_veryhigh[, 1]), 
      col = four_cols[4])

abline(v = c(mean(b_hats_no[, 1]), 
             mean(b_hats_small[, 1]),
             mean(b_hats_high[, 1]), 
             mean(b_hats_veryhigh[, 1])),
       col = four_cols, lty = 2)

legend("topleft", 
       legend = c("No", "Small", "High", "Very high"),
       col = four_cols, 
       lty = "dashed", 
       cex = 0.7,
       bty = "n")

# second plot 

plot(density(b_hats_no[, 2]), 
     xlim = c(0.009, 0.031), 
     main = "Standard Error",
     bty = "n", 
     las = 1, 
     yaxt = "n", 
     ylab = "",
     xlab = "",
     type = "n")

polygon(density(b_hats_no[, 2]),
        col = four_cols_alpha[1],
        border = F)
polygon(density(b_hats_small[, 2]),
        col = four_cols_alpha[2],
        border = F)
polygon(density(b_hats_high[, 2]),
        col = four_cols_alpha[3],
        border = F)
polygon(density(b_hats_veryhigh[, 2]),
        col = four_cols_alpha[4],
        border = F)

lines(density(b_hats_no[, 2]), 
      col = four_cols[1])
lines(density(b_hats_small[, 2]), 
      col = four_cols[2])
lines(density(b_hats_high[, 2]), 
      col = four_cols[3])
lines(density(b_hats_veryhigh[, 2]), 
      col = four_cols[4])

abline(v = c(mean(b_hats_no[, 2]), 
             mean(b_hats_small[, 2]),
             mean(b_hats_high[, 2]), 
             mean(b_hats_veryhigh[, 2])),
       col = four_cols, 
       lty = 2)

What is the relationship between multicollinearity and the standard error?

# Average simulated standard error at each level of correlation.
mean_ses <- vapply(
  list(b_hats_no, b_hats_small, b_hats_high, b_hats_veryhigh),
  function(m) mean(m[, 2]),
  numeric(1)
)

plot(x = c(0, 0.3, 0.6, 0.9),
     y = mean_ses,
     pch = 19,
     las = 1,
     col = viridis(1, 0.75),
     main = "Multicollinearity and Standard Error",
     xlab = "Correlation",
     ylab = "Standard Error",
     bty = "n")

Multicollinearity increases the sampling variance of the OLS coefficients. This in turn increases the SEs and CIs of the coefficients. The good news is that even strong multicollinearity does not bias our coefficients.

ggplot2

# Stack the four simulation runs into one data frame and label the rows
# with an ordered factor so plots show scenarios in a sensible order.
mc_levels <- c("No", "Small", "High", "Very High")
b_hats <- as.data.frame(
  rbind(b_hats_no, b_hats_small, b_hats_high, b_hats_veryhigh)
)
b_hats$mc <- factor(rep(mc_levels, each = nrow(b_hats) / 4),
                    levels = mc_levels)


# first plot

# Precompute the group means of the point estimates for the dashed lines.
mean_est_by_mc <- b_hats %>%
  group_by(mc) %>%
  summarise(mean = mean(Estimate)) %>%
  pull(mean)

# Density of the point estimates, one curve per multicollinearity level.
p1 <- ggplot(data = b_hats, aes(x = Estimate, fill = mc, color = mc)) +
  geom_density() +
  geom_vline(xintercept = mean_est_by_mc,
             linetype = "dashed",
             color = viridis(4)) +
  labs(title = "Point Estimate",
       x = "",
       y = "",
       fill = "Multicollinearity",
       color = "Multicollinearity") +
  scale_fill_viridis(discrete = TRUE, alpha = 0.5) +
  scale_color_viridis(discrete = TRUE)

# Precompute the group means of the standard errors for the dashed lines.
mean_se_by_mc <- b_hats %>%
  group_by(mc) %>%
  summarise(mean = mean(`Std. Error`)) %>%
  pull(mean)

# Density of the standard errors, one curve per multicollinearity level.
p2 <- ggplot(data = b_hats, aes(x = `Std. Error`, fill = mc, color = mc)) +
  geom_density() +
  geom_vline(xintercept = mean_se_by_mc,
             linetype = "dashed",
             color = viridis(4)) +
  labs(title = "Standard Error",
       x = "",
       y = "",
       fill = "Multicollinearity",
       color = "Multicollinearity") +
  scale_fill_viridis(discrete = TRUE, alpha = 0.5) +
  scale_color_viridis(discrete = TRUE)

# Combine both panels with a shared legend (patchwork). The & operator
# applies the white baseline to every panel in the composition.
p1 + p2 + plot_layout(guides = "collect") &
  geom_hline( # remove the bottom line for cleaner look 
    yintercept = 0,
    color = "white" # grid line color
    )

What is the relationship between multicollinearity and the standard error?

b_hats %>% # take b_hats
  group_by(mc) %>% # apply all calculations within groups of mc
  # summarise_all() is superseded in current dplyr; across(everything(), ...)
  # is the modern idiom for "mean of every non-grouping column"
  summarise(across(everything(), mean)) %>%
  ggplot() +
  # NOTE(review): mapping an external vector in aes() works here only
  # because the summarised data has exactly four rows (one per mc level)
  geom_point(aes(x = c(0, 0.3, 0.6, 0.9),
                 y = `Std. Error`)) +
  labs(x = "Multicollinearity",
       title = "Multicollinearity and Standard Error",
       y = "Standard Error") +
  theme_classic() +
  scale_x_continuous(breaks = c(0, 0.3, 0.6, 0.9))

Multicollinearity increases the sampling variance of the OLS coefficients. This in turn increases the SEs and CIs of the coefficients. The good news is that even strong multicollinearity does not bias our coefficients.

Heteroscedasticity

# Generate some data

# Simulated predictor and a constant-variance (homoscedastic) error term.
x <- runif(100, min = 0.2, max = 1)
e <- rnorm(100, mean = 0, sd = 0.5)

# Homoscedastic outcome: error variance does not depend on x.
y1 <- 2 * x + e

If the variance of the error term is correlated with \(x\), we get heteroscedastic data.

y2 <- 2 * x + e * x^2 

Let’s have a look at it.

Base R

par(mfrow = c(1, 2))

# Scatter both outcomes against x with an identical plot style;
# only the data and the title differ between the panels.
panels <- list("Homoskedastic data" = y1,
               "Heteroskedastic data" = y2)
for (ttl in names(panels)) {
  plot(x = x,
       y = panels[[ttl]],
       pch = 19,
       col = viridis(1, 0.75)[1],
       bty = "n",
       las = 1,
       main = ttl)
}

ggplot2

# Homoscedastic data
p_hom <- ggplot() +
  geom_point(aes(x = x, y = y1),
             size = 2,
             color = viridis(1, 0.75)[1]
             ) + 
  labs(
    title = "Homoskedastic data",
    x = "x",
    y = "y1"
  ) +
  theme_classic() 
# Heteroskedastic data
p_het <- ggplot() +
  geom_point(aes(x = x, y = y2),
    color = viridis(1, 0.75)[1], 
    size = 2
    ) + 
  labs(
    title = "Heteroskedastic data",
    x = "x",
    y = "y2"
  ) +
  theme_classic()
p_hom + p_het

Now we run the regressions.

# Fit OLS to both outcomes; the estimator is identical, only the error
# structure of the data differs between the two models.
reg_hom <- lm(y1 ~ x) 
reg_het <- lm(y2 ~ x)

Have a look at the residuals.

Base R

par(mfrow = c(1, 2))

# Residuals vs fitted values; under homoscedasticity the vertical spread
# of the residuals should be roughly constant across fitted values.
fits <- list(Homoscedasticity = reg_hom,
             Heteroscedasticity = reg_het)
for (ttl in names(fits)) {
  mod <- fits[[ttl]]
  plot(x = fitted.values(mod),
       y = residuals(mod),
       pch = 19,
       xlab = "Fitted Values",
       ylab = "Residuals",
       main = ttl,
       col = viridis(1, 0.75),
       bty = "n",
       ylim = c(-1.5, 1),
       las = 1)
  abline(h = 0)
}

ggplot2

# Residuals vs x for both models.
p1 <- ggplot(data = reg_hom$model,
             aes(x = x, y = residuals(reg_hom))) +
  geom_point(color = viridis(1, 0.75), # add points
             size = 2) +
  theme_classic() + # change the appearance
  labs(x = "X",
       y = "Residuals",
       title = "Homoscedasticity") +
  geom_hline(yintercept = 0, # add the zero line
             linewidth = 0.5) + # `size` for lines is deprecated (ggplot2 >= 3.4.0)
  scale_y_continuous(limits = c(-1.5, 2))

# Fixed: this panel previously used `data = reg_hom$model` while plotting
# residuals(reg_het). It only worked because both models share the same x;
# using the matching model's data is the safe choice.
p2 <- ggplot(data = reg_het$model,
             aes(x = x, y = residuals(reg_het))) +
  geom_point(color = viridis(1, 0.75), # add points
             size = 2) +
  theme_classic() + # change the appearance
  labs(x = "X",
       y = "Residuals",
       title = "Heteroscedasticity") +
  geom_hline(yintercept = 0, # add the zero line
             linewidth = 0.5) +
  scale_y_continuous(limits = c(-1.5, 2))

p1 + p2 

What can we do about it? Use Robust Standard Errors: e.g. sandwich package

# Build the data frame directly: data.frame() is the idiomatic constructor,
# whereas cbind() first coerces everything to a matrix. Columns keep the
# same names ("y2", "x") as before.
data <- data.frame(y2 = y2, x = x)

# OLS fit on the heteroscedastic data.
reg_ols <- lm(y2 ~ x,
              data = data)

summary(reg_ols)
## 
## Call:
## lm(formula = y2 ~ x, data = data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.64178 -0.08214 -0.02146  0.06237  0.92234 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.05062    0.05373   0.942    0.349    
## x            1.91473    0.09009  21.254   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2097 on 98 degrees of freedom
## Multiple R-squared:  0.8217, Adjusted R-squared:  0.8199 
## F-statistic: 451.7 on 1 and 98 DF,  p-value: < 2.2e-16
vcovHC(reg_ols)
##              (Intercept)            x
## (Intercept)  0.002200636 -0.005187228
## x           -0.005187228  0.013099920
sqrt(diag(vcovHC(reg_ols)))
## (Intercept)           x 
##  0.04691094  0.11445488
coeftest(reg_ols, vcov. = vcovHC)
## 
## t test of coefficients:
## 
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 0.050618   0.046911   1.079   0.2832    
## x           1.914733   0.114455  16.729   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

However, differences between robust and normal standard errors often reveal problems with your model specification.

See for example: King and Roberts 2015: How Robust Standard Errors Expose Methodological Problems They Do Not Fix, and What to Do About It.

In AQM next semester you will learn how to model heteroscedastic regressions.

Outliers/Influential Observations

# Some fake data

# 99 well-behaved draws plus one extreme observation (5, 20) as the outlier.
x <- c(rnorm(99, mean = 0, sd = 1), 5)
y <- c(rnorm(99, mean = 1, sd = 1), 20)

# Fit OLS on the full data, outlier included.
data <- data.frame(x = x, y = y)
m1 <- lm(y ~ x, data = data)
summary(m1)
## 
## Call:
## lm(formula = y ~ x, data = data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.1556 -0.9727 -0.0238  0.8218 14.4951 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   1.2457     0.1911   6.520 3.07e-09 ***
## x             0.8519     0.1740   4.896 3.85e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.91 on 98 degrees of freedom
## Multiple R-squared:  0.1966, Adjusted R-squared:  0.1884 
## F-statistic: 23.97 on 1 and 98 DF,  p-value: 3.847e-06

Let’s plot the values

Base R

# Scatterplot with the fitted line; the single extreme point at (5, 20)
# pulls the regression line upward at the right end.
plot(x, y,
     pch = 19, 
     col = two_cols_alpha[1],
     bty = "n", 
     las = 1)
abline(lm(y ~ x), col = two_cols[1])

ggplot2

# Same scatterplot in ggplot2 with the fitted line from m1.
ggplot() +
  geom_point(aes(x = x, y = y), 
             color = viridis(1, alpha = 0.5), # add points
             size = 2) + 
  geom_abline(intercept = m1$coefficients[1], # add the line
              slope = m1$coefficients[2],
              color = viridis(1),
              linewidth = 1) + # `size` for lines is deprecated (ggplot2 >= 3.4.0)
  theme_classic()

Cook’s Distance

# calculate Cook's distance: how much each observation influences the fit
D <- cooks.distance(m1)

# ...and plot it; the planted outlier should stand out clearly
plot(D, 
     main = "Cook's Distance",
     pch = 19, 
     col = two_cols_alpha[1],
     bty = "n", 
     las = 1)

Drop influential observations (here, just the one outlier).

s <- D < 1

Run the regression without the outlier.

# Refit the model on the subset that excludes the influential observation.
m2 <- lm(y ~ x, 
         data = data[s,])
summary(m2)
## 
## Call:
## lm(formula = y ~ x, data = data[s, ])
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.22717 -0.72367  0.04444  0.67031  2.45732 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.06721    0.09662  11.046   <2e-16 ***
## x            0.08596    0.09832   0.874    0.384    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.9605 on 97 degrees of freedom
## Multiple R-squared:  0.007818,   Adjusted R-squared:  -0.002411 
## F-statistic: 0.7643 on 1 and 97 DF,  p-value: 0.3841

Base R

Let’s compare the two models.

par(mfrow = c(1, 2))

# The same scatterplot twice: full data with m1's line on the left,
# outlier-free data with m2's line on the right. Axis limits are fixed
# so the two panels are directly comparable.
subsets <- list(rep(TRUE, length(x)), s)
models <- list(m1, m2)
titles <- c("Model with Outlier", "Model without Outlier")

for (i in seq_along(models)) {
  keep <- subsets[[i]]
  plot(x = x[keep],
       y = y[keep],
       xlim = c(-3, 5),
       ylim = c(-1.5, 20),
       main = titles[i],
       pch = 19,
       col = two_cols_alpha[1],
       bty = "n",
       las = 1)
  abline(models[[i]], col = two_cols[1])
}

Outliers can heavily influence our estimates. Usually, we do not want the results of regression analyses to depend on a few influential outliers.

ggplot2

# First the model without excluding the outlier    
p1 <- ggplot() +
  geom_point(aes(x = x, y = y), 
             color = viridis(1, alpha = 0.5), # add points
             size = 2) + 
  geom_abline(intercept = m1$coefficients[1], # add the line
              slope = m1$coefficients[2],
              color = viridis(1),
              size = 1) +
  theme_classic() +
  labs(title = "Model with outlier") 

# Second the model excluding the outlier
p2 <- ggplot() +
  geom_point(aes(x = x, y = y), 
             color = viridis(1, alpha = 0.5), # add points
             size = 2) + 
  geom_abline(intercept = m2$coefficients[1], # add the line
              slope = m2$coefficients[2],
              color = viridis(1),
              size = 1) +
  theme_classic() +
  labs(title = "Model without outlier") 

p1 + p2

Outliers can heavily influence our estimates. Usually, we do not want the results of regression analyses to depend on a few influential outliers.

Exercise Section: Measurement error in Y

What happens if we have measurement error in the dependent variable Y? Run a Monte Carlo simulation for measurement error in Y (without measurement error in V2 or V3).

For this, you will have to adjust the gen_pop function. Note: To add measurement error in Y, you need to add only one line of code.

gen_pop <- function(mus, varcov, b, sigma_est, sigma_err){
  # Generate a population of 100,000 observations with (optional)
  # classical measurement error in the dependent variable.
  #
  # mus:       vector of means for the regressors
  # varcov:    variance-covariance matrix of the regressors
  # b:         true coefficients (intercept first)
  # sigma_est: sd of the structural error term
  # sigma_err: sd of the measurement error added to Y (0 = no error)
  #
  # Returns a data frame with columns Y, V2, V3 (regressors named by
  # data.frame's default matrix-column naming).
  
  # Generate IVs (multivariate normal)
  X <- mvrnorm(n = 100000, 
               mu = mus, 
               Sigma = varcov)
  
  # Linear predictor
  mu_y <- cbind(1,X) %*% b
  
  # True outcome with structural error
  Y <- rnorm(100000, 
             mean = mu_y, 
             sd = sigma_est)
  
  # Add measurement error in Y: mean-zero noise, uncorrelated with X.
  # Fixed: sigma_err was previously accepted but never used, so the
  # downstream simulations comparing error levels had no effect.
  Y <- Y + rnorm(100000, mean = 0, sd = sigma_err)
  
  population <- data.frame(cbind(Y, X))
  
  return(population)
}

Now run the following code to see how different levels of measurement error in Y affect our estimates:

We assume no multicollinearity

# Regressors are uncorrelated throughout; only the amount of measurement
# error in Y varies across the four populations.
cor_mat1 <- matrix(c(1, 0,
                     0, 1), nrow = 2)
varcov1 <- sds_mat %*% cor_mat1 %*% sds_mat

# One population per level of measurement error in Y
err_levels <- c(no = 0, small = 0.25, high = 0.5, veryhigh = 1)

pop1 <- gen_pop(mus = mus, varcov = varcov1, b = b,
                sigma_est = sigma_est, sigma_err = err_levels[["no"]])

pop2 <- gen_pop(mus = mus, varcov = varcov1, b = b,
                sigma_est = sigma_est, sigma_err = err_levels[["small"]])

pop3 <- gen_pop(mus = mus, varcov = varcov1, b = b,
                sigma_est = sigma_est, sigma_err = err_levels[["high"]])

pop4 <- gen_pop(mus = mus, varcov = varcov1, b = b,
                sigma_est = sigma_est, sigma_err = err_levels[["veryhigh"]])

Simulate n times

# Draw 1000 samples from each population and re-estimate the model each
# time; after t() each object is a 1000 x 2 matrix of estimates and SEs.
b_hats_no <- t(replicate(1000, drawncalc(pop = pop1)))

b_hats_small <- t(replicate(1000, drawncalc(pop = pop2)))

b_hats_high <- t(replicate(1000, drawncalc(pop = pop3)))

b_hats_veryhigh <- t(replicate(1000, drawncalc(pop = pop4)))

And plot the results

Base R

par(mfrow = c(1, 2))

# Collect the four simulation runs so we can loop over them when plotting.
estimates <- list(b_hats_no, b_hats_small, b_hats_high, b_hats_veryhigh)

# First plot: sampling distributions of the point estimates.
plot(density(b_hats_no[, 1]),
     xlim = c(2.7, 3.05),
     main = "Point Estimate",
     bty = "n",
     las = 1,
     yaxt = "n",
     ylab = "",
     xlab = "",
     type = "n")

for (i in seq_along(estimates)) {
  polygon(density(estimates[[i]][, 1]),
          col = four_cols_alpha[i],
          border = FALSE)
}
for (i in seq_along(estimates)) {
  lines(density(estimates[[i]][, 1]), col = four_cols[i])
}

# Dashed vertical lines at each scenario's mean estimate.
abline(v = vapply(estimates, function(m) mean(m[, 1]), numeric(1)),
       col = four_cols, lty = 2)

legend("topleft",
       legend = c("No", "Small", "High", "Very high"),
       col = four_cols,
       lty = "dashed",
       cex = 0.7,
       bty = "n")

# Second plot: sampling distributions of the standard errors.
plot(density(b_hats_no[, 2]),
     xlim = c(0.009, 0.04),
     main = "Standard Error",
     bty = "n",
     las = 1,
     yaxt = "n",
     ylab = "",
     xlab = "",
     type = "n")

for (i in seq_along(estimates)) {
  polygon(density(estimates[[i]][, 2]),
          col = four_cols_alpha[i],
          border = FALSE)
}
for (i in seq_along(estimates)) {
  lines(density(estimates[[i]][, 2]), col = four_cols[i])
}

abline(v = vapply(estimates, function(m) mean(m[, 2]), numeric(1)),
       col = four_cols, lty = 2)

What do you observe?

Because the measurement error is uncorrelated with X, the OLS estimators are unbiased, but the variance is inflated (larger variances, larger standard errors).

ggplot2

# Stack the four simulation runs and label the rows with an ordered factor
# recording the level of measurement error.
err_labels <- c("No", "Small", "High", "Very High")
b_hats <- as.data.frame(
  rbind(b_hats_no, b_hats_small, b_hats_high, b_hats_veryhigh)
)
b_hats$error <- factor(rep(err_labels, each = nrow(b_hats) / 4),
                       levels = err_labels)


# first plot

# Precompute the group means of the point estimates for the dashed lines.
mean_est_by_err <- b_hats %>%
  group_by(error) %>%
  summarise(mean = mean(Estimate)) %>%
  pull(mean)

# Density of the point estimates, one curve per level of measurement error.
p1 <- ggplot(data = b_hats, aes(x = Estimate, fill = error, color = error)) +
  geom_density() +
  geom_vline(xintercept = mean_est_by_err,
             linetype = "dashed",
             color = viridis(4)) +
  labs(title = "Point Estimate",
       x = "",
       y = "",
       fill = "Measurement Error",
       color = "Measurement Error") +
  scale_fill_viridis(discrete = TRUE, alpha = 0.5) +
  scale_color_viridis(discrete = TRUE) +
  theme(legend.title = element_text()) +
  geom_hline(yintercept = 0, # hide the bottom line for a cleaner look
             color = "white")

# second plot 

# Precompute the group means of the standard errors for the dashed lines.
mean_se_by_err <- b_hats %>%
  group_by(error) %>%
  summarise(mean = mean(`Std. Error`)) %>%
  pull(mean)

# Density of the standard errors, one curve per level of measurement error.
p2 <- ggplot(data = b_hats, aes(x = `Std. Error`, fill = error, color = error)) +
  geom_density() +
  geom_vline(xintercept = mean_se_by_err,
             linetype = "dashed",
             color = viridis(4)) +
  labs(title = "Standard Error",
       x = "",
       y = "",
       fill = "Measurement Error",
       color = "Measurement Error") +
  scale_fill_viridis(discrete = TRUE, alpha = 0.5) +
  scale_color_viridis(discrete = TRUE) +
  theme(legend.title = element_text()) +
  geom_hline(yintercept = 0, # hide the bottom line for a cleaner look
             color = "white")

# put plots together with patchwork package 
p1 + p2 + plot_layout(guides = "collect") +
  plot_annotation(title = "The Effect of Measurement Error in Y on Estimates") 

What do you observe?

Because the measurement error is uncorrelated with X, the OLS estimators are unbiased, but the variance is inflated (larger variances, larger standard errors).

Concluding Remarks